In this project, we shall build a model which predicts the outcome of cricket matches in the Indian Premier League using data about matches and deliveries.
In [1]:
%matplotlib inline
import numpy as np # imports a fast numerical programming library
import matplotlib.pyplot as plt #sets up plotting under plt
import pandas as pd #lets us handle data as dataframes
#sets up pandas table display
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
from __future__ import division
In [2]:
# Load the ball-by-ball and the match-level tables from the shared data folder
alldeliveries = pd.read_csv("../data/deliveries.csv")
allmatches = pd.read_csv("../data/matches.csv")
# Preview the first ten matches
allmatches.head(n=10)
Out[2]:
In [3]:
# Selecting Seasons 2008 - 2015: every match that is not part of the 2016 season
matches_seasons = allmatches.loc[allmatches['season'] != 2016]
# Keep exactly the deliveries of the retained matches. Filtering on the ids that
# are present in matches_seasons keeps both tables consistent by construction,
# instead of relying on a hard-coded id cut-off (518) for the first 2016 match.
deliveries_seasons = alldeliveries.loc[alldeliveries['match_id'].isin(matches_seasons['id'])]
In [4]:
# Restrict the data to fixtures in which both sides belong to the seven
# selected franchises: DD, KKR, MI, RCB, KXIP, RR, CSK
selected_teams = ['Kolkata Knight Riders', 'Royal Challengers Bangalore', 'Delhi Daredevils',
                  'Chennai Super Kings', 'Rajasthan Royals', 'Mumbai Indians', 'Kings XI Punjab']
team1_selected = matches_seasons['team1'].isin(selected_teams)
team2_selected = matches_seasons['team2'].isin(selected_teams)
matches_teams = matches_seasons.loc[team1_selected & team2_selected]

# Apply the same selection to the ball-by-ball data through the match ids
matches_team_matchids = matches_teams.id.unique()
deliveries_teams = deliveries_seasons.loc[deliveries_seasons['match_id'].isin(matches_team_matchids)]

print("Teams selected:\n")
for team in matches_teams.team1.unique():
    print(team)
In [5]:
# Neglect matches with inconsistencies like 'No Result' or 'D/L Applied'.
# .copy() makes `matches` an independent frame, so the feature columns that are
# assigned to it further down do not trigger pandas' SettingWithCopyWarning.
matches = matches_teams.loc[(matches_teams['result'] == 'normal') & (matches_teams['dl_applied'] == 0)].copy()
matches_matchids = matches.id.unique()
deliveries = deliveries_teams.loc[deliveries_teams['match_id'].isin(matches_matchids)]
# Verifying consistency between datasets: both tables must list the same match
# ids in the same order. np.array_equal also handles arrays of different
# lengths, where an element-wise `==` does not produce a boolean array that
# .all() could be called on.
np.array_equal(matches.id.unique(), deliveries.match_id.unique())
Out[5]:
In [6]:
# Batsman Strike Rate Calculation
# Team 1: Batting First; Team 2: Fielding First
def getMatchDeliveriesDF(match_id):
    """Return the rows of the global `deliveries` frame whose match_id equals `match_id`."""
    in_match = deliveries['match_id'] == match_id
    return deliveries.loc[in_match]
def getInningsOneBatsmen(match_deliveries):
    """Return at most the first five distinct batsman values of innings 1.

    The values keep the order in which they first occur in `match_deliveries`.
    """
    first_innings = match_deliveries.loc[match_deliveries['inning'] == 1]
    batting_order = first_innings['batsman'].unique()
    return batting_order[:5]
def getInningsTwoBatsmen(match_deliveries):
    """Return at most the first five distinct batsman values of innings 2.

    The values keep the order in which they first occur in `match_deliveries`.
    """
    second_innings = match_deliveries.loc[match_deliveries['inning'] == 2]
    batting_order = second_innings['batsman'].unique()
    return batting_order[:5]
def getBatsmanStrikeRate(batsman, match_id):
    """Runs per 100 deliveries scored by `batsman` in matches with a smaller id.

    Only rows of the global `deliveries` frame whose match_id is strictly
    below `match_id` are considered, so the match itself never contributes.
    Returns None when the batsman has no such delivery.
    """
    earlier = deliveries['match_id'] < match_id
    on_strike = deliveries['batsman'] == batsman
    faced = deliveries.loc[earlier & on_strike]
    balls_faced = faced.shape[0]
    runs_scored = faced['batsman_runs'].sum()
    if balls_faced == 0:
        return None
    return (runs_scored / balls_faced) * 100
def getTeamStrikeRate(batsmen, match_id):
    """Mean of the prior strike rates of `batsmen` before `match_id`.

    Batsmen for whom getBatsmanStrikeRate returns None (no earlier delivery)
    are left out of the mean.

    Returns NaN when no batsman has a strike rate, e.g. for the very first
    match or an empty `batsmen` sequence. The NaN is returned explicitly so
    that np.mean is never called on an empty list, which would yield the same
    value but emit a "Mean of empty slice" RuntimeWarning.
    """
    strike_rates = []
    for batsman in batsmen:
        rate = getBatsmanStrikeRate(batsman, match_id)
        # Identity test: None is a singleton, and `!=` on NumPy scalars is an
        # element-wise comparison rather than a "has a value" check.
        if rate is not None:
            strike_rates.append(rate)
    if not strike_rates:
        return np.nan
    return np.mean(strike_rates)
def getAverageStrikeRates(match_id):
    """Return (teamOneSR, teamTwoSR) for the match `match_id`.

    Team one is represented by the innings-1 batsmen and team two by the
    innings-2 batsmen, as selected by getInningsOneBatsmen and
    getInningsTwoBatsmen; each value is that group's getTeamStrikeRate.
    """
    ball_by_ball = getMatchDeliveriesDF(match_id)
    first_batsmen = getInningsOneBatsmen(ball_by_ball)
    second_batsmen = getInningsTwoBatsmen(ball_by_ball)
    return (
        getTeamStrikeRate(first_batsmen, match_id),
        getTeamStrikeRate(second_batsmen, match_id),
    )
In [7]:
# Testing Functionality: average prior strike rates of both sides for match id 517
getAverageStrikeRates(517)
Out[7]:
In [8]:
# Bowler Rating: wickets taken per 100 runs conceded (higher is better)
# Team 1: Batting First; Team 2: Fielding First
def getInningsOneBowlers(match_deliveries):
    """Return at most the first four distinct bowler values of innings 1.

    The values keep the order in which they first occur in `match_deliveries`.
    """
    first_innings = match_deliveries.loc[match_deliveries['inning'] == 1]
    bowling_order = first_innings['bowler'].unique()
    return bowling_order[:4]
def getInningsTwoBowlers(match_deliveries):
    """Return at most the first four distinct bowler values of innings 2.

    The values keep the order in which they first occur in `match_deliveries`.
    """
    second_innings = match_deliveries.loc[match_deliveries['inning'] == 2]
    bowling_order = second_innings['bowler'].unique()
    return bowling_order[:4]
def getBowlerWPR(bowler, match_id):
    """Wickets per 100 runs conceded by `bowler` in matches with a smaller id.

    Only rows of the global `deliveries` frame whose match_id is strictly
    below `match_id` are considered. Runs are the sum of `total_runs`;
    wickets are the deliveries whose `dismissal_kind` is one of caught,
    bowled, lbw, caught and bowled or stumped.

    Returns None when no rating can be computed: the bowler has no earlier
    delivery, or those deliveries conceded no runs at all. The second case
    used to divide by zero and return inf/nan, which then poisoned the team
    average.
    """
    balls = deliveries.loc[(deliveries['match_id'] < match_id) & (deliveries['bowler'] == bowler)]
    total_runs = balls['total_runs'].sum()
    if balls.shape[0] == 0 or total_runs == 0:
        return None
    # NOTE(review): 'hit wicket' is not in this list - confirm that is intended.
    credited_dismissals = ['caught', 'bowled', 'lbw', 'caught and bowled', 'stumped']
    total_wickets = balls.loc[balls['dismissal_kind'].isin(credited_dismissals)].shape[0]
    return (total_wickets / total_runs) * 100
def getTeamWPR(bowlers, match_id):
    """Mean of the prior wickets-per-run ratings of `bowlers` before `match_id`.

    Bowlers for whom getBowlerWPR returns None are left out of the mean.

    Returns NaN when no bowler has a rating, e.g. for the very first match or
    an empty `bowlers` sequence. The NaN is returned explicitly so that
    np.mean is never called on an empty list, which would yield the same value
    but emit a "Mean of empty slice" RuntimeWarning.
    """
    WPRs = []
    for bowler in bowlers:
        rating = getBowlerWPR(bowler, match_id)
        # Identity test: None is a singleton, and `!=` on NumPy scalars is an
        # element-wise comparison rather than a "has a value" check.
        if rating is not None:
            WPRs.append(rating)
    if not WPRs:
        return np.nan
    return np.mean(WPRs)
def getAverageWPR(match_id):
    """Return (teamOneWPR, teamTwoWPR) for the match `match_id`.

    The pairing is crossed on purpose: team one's rating comes from the
    innings-2 bowlers and team two's rating from the innings-1 bowlers.
    """
    ball_by_ball = getMatchDeliveriesDF(match_id)
    first_innings_bowlers = getInningsOneBowlers(ball_by_ball)
    second_innings_bowlers = getInningsTwoBowlers(ball_by_ball)
    return (
        getTeamWPR(second_innings_bowlers, match_id),
        getTeamWPR(first_innings_bowlers, match_id),
    )
In [9]:
# Testing functionality: average prior bowler ratings of both sides for match id 517
getAverageWPR(517)
Out[9]:
In [10]:
# MVP Score (Total number of Player of the Match awards in a squad)
def getAllInningsOneBatsmen(match_deliveries):
    """Return all distinct values of the batsman column for innings 1."""
    first_innings = match_deliveries[match_deliveries['inning'] == 1]
    return first_innings['batsman'].unique()
def getAllInningsTwoBatsmen(match_deliveries):
    """Return all distinct values of the batsman column for innings 2."""
    second_innings = match_deliveries[match_deliveries['inning'] == 2]
    return second_innings['batsman'].unique()
def getAllInningsOneBowlers(match_deliveries):
    """Return all distinct values of the bowler column for innings 1."""
    first_innings = match_deliveries[match_deliveries['inning'] == 1]
    return first_innings['bowler'].unique()
def getAllInningsTwoBowlers(match_deliveries):
    """Return all distinct values of the bowler column for innings 2."""
    second_innings = match_deliveries[match_deliveries['inning'] == 2]
    return second_innings['bowler'].unique()
def makeSquad(batsmen, bowlers):
    """Combine `batsmen` and `bowlers` into one array of players.

    All batsmen come first, followed by each bowler that does not also
    appear among the batsmen.
    """
    squad = np.append([], batsmen)
    for bowler in bowlers:
        if bowler in batsmen:
            # Already part of the squad as a batsman
            continue
        squad = np.append(squad, bowler)
    return squad
def getPlayerMVPAwards(player, match_id):
    """Count the rows of the global `matches` frame with an id below `match_id`
    in which `player` is the player_of_match."""
    was_player_of_match = matches['player_of_match'] == player
    played_earlier = matches['id'] < match_id
    return len(matches.loc[was_player_of_match & played_earlier])
def getTeamMVPAwards(squad, match_id):
    """Sum getPlayerMVPAwards over every player in `squad`; 0 for an empty squad."""
    return sum(getPlayerMVPAwards(player, match_id) for player in squad)
def compareMVPAwards(match_id):
    """Return (teamOneAwards, teamTwoAwards) for the match `match_id`.

    Team one's squad is built from the innings-1 batsmen and the innings-2
    bowlers, team two's squad from the innings-2 batsmen and the innings-1
    bowlers. Each value is the squad's getTeamMVPAwards total.
    """
    ball_by_ball = getMatchDeliveriesDF(match_id)
    first_batsmen = getAllInningsOneBatsmen(ball_by_ball)
    second_batsmen = getAllInningsTwoBatsmen(ball_by_ball)
    first_bowlers = getAllInningsOneBowlers(ball_by_ball)
    second_bowlers = getAllInningsTwoBowlers(ball_by_ball)
    squads = (
        makeSquad(first_batsmen, second_bowlers),
        makeSquad(second_batsmen, first_bowlers),
    )
    return tuple(getTeamMVPAwards(squad, match_id) for squad in squads)
In [11]:
# Testing functionality: Player of the Match award totals of both squads for match id 517
compareMVPAwards(517)
Out[11]:
In [12]:
# Prints a comparison between two teams based on squad attributes
def generateSquadRating(match_id):
    """Print a squad comparison of the two sides of the match `match_id`.

    The sides are the first two distinct batting_team values of the match in
    the global `deliveries` frame. For each side the average strike rate, the
    bowler rating and the number of MVP awards are printed. Returns None.
    """
    batting_sides = deliveries.loc[deliveries['match_id'] == match_id].batting_team.unique()
    teamOne, teamTwo = batting_sides[0], batting_sides[1]

    # Gather every figure first so nothing is printed if a lookup fails
    strike_rates = getAverageStrikeRates(match_id)
    bowler_ratings = getAverageWPR(match_id)
    mvp_awards = compareMVPAwards(match_id)

    print("Comparing squads for {} vs {}".format(teamOne, teamTwo))
    comparisons = (
        ("\nAverage Strike Rate for Batsmen in {} : {}", strike_rates),
        ("\nBowler Rating (W/R) for {} : {}", bowler_ratings),
        ("\nNumber of MVP Awards in {} : {}", mvp_awards),
    )
    for template, values in comparisons:
        for team, value in zip((teamOne, teamTwo), values):
            print(template.format(team, value))
In [13]:
# Testing Functionality: print the squad comparison for match id 517
generateSquadRating(517)
In [14]:
## 2nd Feature : Previous Encounter
# Won by runs and won by wickets (Higher the better)
def getTeam1(match_id):
    """Return the distinct team1 values of the row(s) of the global `matches`
    frame whose id equals `match_id`, as an array."""
    match_rows = matches.loc[matches["id"] == match_id]
    return match_rows["team1"].unique()
def getTeam2(match_id):
    """Return the distinct team2 values of the row(s) of the global `matches`
    frame whose id equals `match_id`, as an array."""
    match_rows = matches.loc[matches["id"] == match_id]
    return match_rows["team2"].unique()
def getPreviousEncDF(match_id):
    """Return the rows of the global `matches` frame with an id below
    `match_id` in which the two sides of `match_id` met, in either
    team1/team2 order."""
    side_a = getTeam1(match_id)
    side_b = getTeam2(match_id)
    played_earlier = matches["id"] < match_id
    same_order = matches["team1"].isin(side_a) & matches["team2"].isin(side_b)
    swapped_order = matches["team1"].isin(side_b) & matches["team2"].isin(side_a)
    return matches.loc[played_earlier & (same_order | swapped_order)]
def getTeamWBR(match_id, team):
    """Sum of win_by_runs over the previous encounters (getPreviousEncDF) of
    `match_id` that `team` won."""
    encounters = getPreviousEncDF(match_id)
    won_by_team = encounters["winner"] == team
    return encounters.loc[won_by_team, 'win_by_runs'].sum()
def getTeamWBW(match_id, team):
    """Sum of win_by_wickets over the previous encounters (getPreviousEncDF)
    of `match_id` that `team` won."""
    encounters = getPreviousEncDF(match_id)
    won_by_team = encounters["winner"] == team
    return encounters.loc[won_by_team, 'win_by_wickets'].sum()
def getTeamWinPerc(match_id):
    """Percentage of the previous encounters of `match_id` won by its team1.

    Returns 0 when the two sides have no previous encounter.
    """
    encounters = getPreviousEncDF(match_id)
    played = encounters.shape[0]
    team1 = getTeam1(match_id)[0].strip("[]")
    won = encounters.loc[encounters["winner"] == team1].shape[0]
    if played == 0:
        return 0
    return (won / played) * 100
def getBothTeamStats(match_id):
    """Print the head-to-head record of the two sides of `match_id`.

    The output covers the number of previous encounters, how many of them
    team1 won (with the percentage from getTeamWinPerc), and for each side
    the summed winning margins in runs and in wickets. Returns None.
    """
    encounters = getPreviousEncDF(match_id)
    team1 = getTeam1(match_id)[0].strip("[]")
    team2 = getTeam2(match_id)[0].strip("[]")
    timesPlayed = len(encounters)
    timesWon = len(encounters.loc[encounters["winner"] == team1])
    # Look up all winning margins before anything is printed
    run_margins = [getTeamWBR(match_id, team) for team in (team1, team2)]
    wicket_margins = [getTeamWBW(match_id, team) for team in (team1, team2)]

    print("Out of {} times in the past {} have won {} times({}%) from {}".format(
        timesPlayed, team1, timesWon, getTeamWinPerc(match_id), team2))
    for team, runs, wickets in zip((team1, team2), run_margins, wicket_margins):
        print("{} won by {} total runs and {} total wickets.".format(team, runs, wickets))
In [15]:
# Testing functionality: print the head-to-head record of the sides of match id 517
getBothTeamStats(517)
In [16]:
# 3rd Feature: Recent Form (Win Percentage of 3 previous matches of a team in the same season)
# Higher the better
def getMatchYear(match_id):
    """Return the distinct season values of the row(s) of the global `matches`
    frame whose id equals `match_id`, as an array."""
    match_rows = matches.loc[matches["id"] == match_id]
    return match_rows["season"].unique()
def getTeam1DF(match_id, year):
    """Return the last three rows (at most) of the global `matches` frame that
    have an id below `match_id`, belong to season `year` and involve the team1
    of `match_id` as either team1 or team2."""
    side = getTeam1(match_id)
    earlier_this_season = (matches["id"] < match_id) & (matches["season"] == year)
    involves_side = matches["team1"].isin(side) | matches["team2"].isin(side)
    return matches.loc[earlier_this_season & involves_side].tail(3)
def getTeam2DF(match_id, year):
    """Return the last three rows (at most) of the global `matches` frame that
    have an id below `match_id`, belong to season `year` and involve the team2
    of `match_id` as either team1 or team2."""
    side = getTeam2(match_id)
    earlier_this_season = (matches["id"] < match_id) & (matches["season"] == year)
    involves_side = matches["team1"].isin(side) | matches["team2"].isin(side)
    return matches.loc[earlier_this_season & involves_side].tail(3)
def _recentWinPercentage(recent_matches, team):
    """Percentage of the rows of `recent_matches` whose winner is `team`; 0 for an empty frame."""
    played = recent_matches.shape[0]
    if played == 0:
        return 0
    won = recent_matches.loc[recent_matches["winner"] == team].shape[0]
    # float() keeps this a true division even without `from __future__ import division`
    return (float(won) / played) * 100


def getTeamWinPercentage(match_id):
    """Recent form of both sides of the match `match_id`.

    For each side, the matches returned by getTeam1DF / getTeam2DF (at most
    three earlier matches of the same season) are taken and the percentage
    won by that side is computed.

    Returns:
        (winPercTeam1, winPercTeam2); a side without any such earlier match
        gets 0.
    """
    # getMatchYear returns a one-element array. Index it explicitly: calling
    # int() on an array with ndim > 0 is deprecated since NumPy 1.25.
    year = int(getMatchYear(match_id)[0])
    team1 = getTeam1(match_id)[0].strip("[]")
    team2 = getTeam2(match_id)[0].strip("[]")
    winPercTeam1 = _recentWinPercentage(getTeam1DF(match_id, year), team1)
    winPercTeam2 = _recentWinPercentage(getTeam2DF(match_id, year), team2)
    return winPercTeam1, winPercTeam2
In [17]:
# Testing Functionality: recent-form win percentages of both sides for match id 517
getTeamWinPercentage(517)
Out[17]:
In [18]:
#Function to implement all features
def getAllFeatures(match_id):
    """Print every feature for the match `match_id`.

    Prints the squad comparison, the head-to-head record and the recent form
    of both sides, separated by blank lines. Returns None.
    """
    generateSquadRating(match_id)
    print("\n")
    getBothTeamStats(match_id)
    print("\n")
    # getTeamWinPercentage only returns its result. Inside a function that
    # value is not displayed, so it was silently dropped - print it here.
    winPercTeam1, winPercTeam2 = getTeamWinPercentage(match_id)
    print("Recent form (win % over up to 3 previous matches this season): Team 1 = {}, Team 2 = {}".format(
        winPercTeam1, winPercTeam2))
In [19]:
# Testing Functionality: print all features for match id 517
getAllFeatures(517)
In [20]:
# Target column: 1 when the side listed as team1 is the winner of the match, 0 otherwise
team1_is_winner = matches['team1'] == matches['winner']
matches['team1Winning'] = np.where(team1_is_winner, 1, 0)
In [21]:
# Feature column: difference of the average strike rates (first team - second team)
# [A negative value means the second team is better]
strike_rate_pairs = [getAverageStrikeRates(match_id) for match_id in matches['id'].unique()]
firstTeamSR = [pair[0] for pair in strike_rate_pairs]
secondTeamSR = [pair[1] for pair in strike_rate_pairs]
firstSRSeries = pd.Series(firstTeamSR)
secondSRSeries = pd.Series(secondTeamSR)
matches["Avg_SR_Difference"] = firstSRSeries.values - secondSRSeries.values
In [22]:
# Feature column: difference of the average bowler ratings (first team WPR - second team WPR)
# [A negative value means the second team is better]
wpr_pairs = [getAverageWPR(match_id) for match_id in matches['id'].unique()]
firstTeamWPR = [pair[0] for pair in wpr_pairs]
secondTeamWPR = [pair[1] for pair in wpr_pairs]
firstWPRSeries = pd.Series(firstTeamWPR)
secondWPRSeries = pd.Series(secondTeamWPR)
matches["Avg_WPR_Difference"] = firstWPRSeries.values - secondWPRSeries.values
In [23]:
# Feature column: difference of the squads' MVP award totals (first team - second team)
# (A negative value means the second team is better)
mvp_pairs = [compareMVPAwards(match_id) for match_id in matches['id'].unique()]
firstTeamMVP = [pair[0] for pair in mvp_pairs]
secondTeamMVP = [pair[1] for pair in mvp_pairs]
firstMVPSeries = pd.Series(firstTeamMVP)
secondMVPSeries = pd.Series(secondTeamMVP)
matches["Total_MVP_Difference"] = firstMVPSeries.values - secondMVPSeries.values
In [24]:
# Feature column: win percentage of team 1 in the previous encounters of the two sides
firstTeamWP = [getTeamWinPerc(match_id) for match_id in matches['id'].unique()]
firstWPSeries = pd.Series(firstTeamWP)
matches["Prev_Enc_Team1_WinPerc"] = firstWPSeries.values
In [25]:
# Feature column: recent form of the first team minus that of the second team, where
# recent form is the win percentage over up to three earlier matches of the same season
# (A negative value means the second team has the higher win percentage)
recent_form_pairs = [getTeamWinPercentage(match_id) for match_id in matches['id'].unique()]
firstTeamRF = [pair[0] for pair in recent_form_pairs]
secondTeamRF = [pair[1] for pair in recent_form_pairs]
firstRFSeries = pd.Series(firstTeamRF)
secondRFSeries = pd.Series(secondTeamRF)
matches["Total_RF_Difference"] = firstRFSeries.values - secondRFSeries.values
In [26]:
# Testing: show the last rows of matches to inspect the feature columns added above
matches.tail()
Out[26]:
In [27]:
# Graph for Average Strike Rate Difference: box plot of Avg_SR_Difference,
# one box per value of team1Winning, with outlier points (fliers) hidden
matches.boxplot(column = 'Avg_SR_Difference', by='team1Winning', showfliers= False)
Out[27]:
In [28]:
# Graph for Average WPR (Wickets per Run) Difference: box plot of Avg_WPR_Difference,
# one box per value of team1Winning, with outlier points (fliers) hidden
matches.boxplot(column = 'Avg_WPR_Difference', by='team1Winning', showfliers= False)
Out[28]:
In [29]:
# Graph for MVP Difference: box plot of Total_MVP_Difference,
# one box per value of team1Winning, with outlier points (fliers) hidden
matches.boxplot(column = 'Total_MVP_Difference', by='team1Winning', showfliers= False)
Out[29]:
In [30]:
# Graph for the previous-encounter win percentage of team 1: box plot of Prev_Enc_Team1_WinPerc,
# one box per value of team1Winning, with outlier points (fliers) hidden
matches.boxplot(column = 'Prev_Enc_Team1_WinPerc', by='team1Winning', showfliers= False)
Out[30]:
In [31]:
# Graph for the recent-form difference (win percentage in the same season): box plot of
# Total_RF_Difference, one box per value of team1Winning, with outlier points (fliers) hidden
matches.boxplot(column = 'Total_RF_Difference', by='team1Winning', showfliers= False)
Out[31]: